Skip to contents
library(babynamesIL)
library(tidyverse)
#> ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
#>  ggplot2 3.3.6      purrr   0.3.4
#>  tibble  3.1.8      dplyr   1.0.9
#>  tidyr   1.2.0      stringr 1.4.0
#>  readr   2.1.2      forcats 0.5.1
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#>  dplyr::filter() masks stats::filter()
#>  dplyr::lag()    masks stats::lag()
library(tgstat)
theme_set(theme_classic())

Israeli baby names

Distribution of names

We will start by looking at the distribution of the total number of babies for each name:

# Density of the per-name totals on a log10 x axis: one panel per sector and
# one curve per sex. The explicit factor levels fix the left-to-right order of
# the panels.
ggplot(
  data = mutate(
    babynamesIL_totals,
    sector = factor(sector, levels = c("Jewish", "Muslim", "Christian", "Druze", "Other"))
  ),
  mapping = aes(x = total, color = sex)
) +
  geom_density() +
  scale_x_log10() +
  ggsci::scale_color_aaas() +
  facet_grid(. ~ sector) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Note that the x axis is in log scale.

Top names

Top 20 names in each sex and sector:

# Bar chart of the 20 most frequent names in every sector/sex combination.
#
# The same name can be in the top 20 of several panels (unisex names, names
# shared between sectors). A factor built from `name` alone has one global
# level order, so with free scales such a name would keep the position it got
# in the first panel it appeared in and the bars would no longer be sorted
# within each panel. We therefore order a key that is unique per panel
# (sector + sex + name) and strip the prefix again when labelling the axis.
babynamesIL_totals %>%
  mutate(sector = factor(sector, levels = c("Jewish", "Muslim", "Christian", "Druze", "Other"))) %>%
  group_by(sector, sex) %>%
  slice_max(order_by = total, n = 20) %>%
  # Drop the grouping so the ordering below is computed over the whole table.
  ungroup() %>%
  arrange(sector, sex, desc(total)) %>%
  mutate(name_key = forcats::fct_inorder(paste(sector, sex, name, sep = "___"))) %>%
  ggplot(aes(x = name_key, y = total)) +
  geom_col() +
  # Show only the name itself: remove everything up to the last separator.
  scale_x_discrete(labels = function(key) sub("^.*___", "", key)) +
  facet_wrap(sector ~ sex, scales = "free", ncol = 2) +
  ylab("total #") +
  xlab("") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Names over time

a single name

# Yearly share (`prop`) of the name "מעיין" among Jewish babies, one line per
# sex. complete() runs on the full table before filtering so that
# sector/year/sex/name combinations without a row become explicit zeros
# instead of gaps in the lines.
ggplot(
  data = babynamesIL %>%
    tidyr::complete(sector, year, sex, name, fill = list(n = 0, prop = 0)) %>%
    filter(sector == "Jewish", name == "מעיין"),
  mapping = aes(x = year, y = prop, color = sex)
) +
  geom_line() +
  scale_y_continuous(labels = scales::percent) +
  ggsci::scale_color_aaas() +
  ggtitle("מעיין הבן מול מעיין הבת") +
  theme_classic()

clustering

We will then create a matrix of the names and their frequencies over time. We will start with Jewish female babies.

# Wide matrix of `prop` for Jewish female babies: one row per name, one column
# per year. Name/year pairs without a row in the data are filled with 0.
names_mat <- babynamesIL %>%
  dplyr::filter(sector == "Jewish") %>%
  dplyr::filter(sex == "F") %>%
  dplyr::select(year, name, prop) %>%
  tidyr::spread(key = year, value = prop, fill = 0) %>%
  tibble::column_to_rownames(var = "name") %>%
  as.matrix()
dim(names_mat)
#> [1] 1478   74

Normalize each name:

# Divide every row (name) by its own sum so that each name's values across the
# years add up to 1.
mat_norm <- sweep(names_mat, MARGIN = 1, STATS = rowSums(names_mat), FUN = "/")

Select only names with at least 500 babies:

# Keep only the rows of names whose overall total among Jewish female babies
# is at least 500; the rows come out in the order of babynamesIL_totals.
mat_norm_f <- mat_norm[
  babynamesIL_totals %>%
    filter(sector == "Jewish", sex == "F", total >= 500) %>%
    pull(name),
]
dim(mat_norm_f)
#> [1] 485  74

Cluster:

# Correlate the names with each other (the matrix is transposed so that names
# are the columns passed to tgs_cor), convert the correlation matrix to
# distances with tgs_dist, and cluster hierarchically with Ward's method.
hc <- hclust(
  tgs_dist(tgs_cor(t(mat_norm_f))),
  method = "ward.D2"
)

Reorder the clustering by the year in which each name peaks:

# Reorder the leaves of the tree without changing its topology. Each name is
# weighted by the column index of its maximum (its peak year) and branches are
# ordered by the mean weight of their members.
hc <- hc %>%
  as.dendrogram() %>%
  reorder(apply(mat_norm_f, 1, which.max), agglo.FUN = mean) %>%
  as.hclust()

Plot the matrix:

# Hover text for every name/year cell: the name, the year and the number of
# babies (0 when the name has no row for that year).
text_mat <- babynamesIL %>%
  filter(sector == "Jewish", sex == "F") %>%
  tidyr::complete(sector, year, sex, name, fill = list(n = 0)) %>%
  mutate(text = paste(name, paste0("year: ", year), paste0("n: ", n), sep = "\n")) %>%
  select(year, name, text) %>%
  spread(year, text) %>%
  column_to_rownames("name") %>%
  as.matrix()
# text_mat still contains every name, while hc$order indexes the rows of the
# filtered mat_norm_f. Align the two by row and column names first; otherwise
# the hover text shown for a cell belongs to a different name.
text_mat <- text_mat[rownames(mat_norm_f), colnames(mat_norm_f)]
# Interactive heatmap: rows are names in clustering order, columns are years.
plotly::plot_ly(
  z = mat_norm_f[hc$order, ],
  y = rownames(mat_norm_f)[hc$order],
  x = colnames(mat_norm_f),
  type = "heatmap",
  colors = colorRampPalette(c("white", "blue", "red", "yellow"))(1000),
  hoverinfo = "text",
  text = text_mat[hc$order, ]
) %>%
  plotly::layout(yaxis = list(title = ""), xaxis = list(title = "Year"))

We will wrap it all in a function:

#' Cluster the names of one sector and sex by their trend over time
#'
#' Builds a name-by-year matrix of `prop` from `babynamesIL`, scales every
#' name to sum to 1, keeps the names whose total in `babynamesIL_totals` is at
#' least `min_total`, clusters them hierarchically and draws an interactive
#' plotly heatmap with the names in clustering order.
#'
#' @param sector Value of the `sector` column to keep.
#' @param sex Value of the `sex` column to keep.
#' @param min_total Minimal `total` a name needs in order to be included.
#' @param colors Vector of colors passed to `plotly::plot_ly()`.
#' @return The plotly heatmap object.
cluster_names <- function(sector, sex, min_total = 500, colors = colorRampPalette(c("white", "blue", "red", "yellow"))(1000)) {
  # Rows of the requested sector and sex; both matrices below start from them.
  sector_sex_rows <- babynamesIL %>%
    filter(sector == !!sector, sex == !!sex)

  # Name-by-year matrix of proportions, 0 where a name has no row for a year.
  prop_by_year <- sector_sex_rows %>%
    select(year, name, prop) %>%
    spread(year, prop, fill = 0) %>%
    column_to_rownames("name") %>%
    as.matrix()

  # Matching matrix of hover labels (name, year and number of babies).
  hover_text <- sector_sex_rows %>%
    tidyr::complete(sector, year, sex, name, fill = list(n = 0)) %>%
    mutate(text = paste(name, paste0("year: ", year), paste0("n: ", n), sep = "\n")) %>%
    select(year, name, text) %>%
    spread(year, text) %>%
    column_to_rownames("name") %>%
    as.matrix()

  # Scale each name to sum to 1, then keep the sufficiently frequent names.
  kept_names <- babynamesIL_totals %>%
    filter(sector == !!sector, sex == !!sex, total >= min_total) %>%
    pull(name)
  trend <- (prop_by_year / rowSums(prop_by_year))[kept_names, ]

  # Bring the hover labels into the same row and column order as `trend`.
  hover_text <- hover_text[rownames(trend), colnames(trend)]

  # Ward clustering on distances derived from the between-name correlations,
  # with the leaves reordered by the column index of each name's maximum.
  tree <- hclust(tgs_dist(tgs_cor(t(trend))), method = "ward.D2")
  tree <- as.hclust(reorder(
    as.dendrogram(tree),
    apply(trend, 1, which.max),
    agglo.FUN = mean
  ))
  row_order <- tree$order

  heatmap <- plotly::plot_ly(
    z = trend[row_order, ],
    y = rownames(trend)[row_order],
    x = colnames(trend),
    type = "heatmap",
    colors = colors,
    hoverinfo = "text",
    text = hover_text[row_order, ]
  )
  plotly::layout(heatmap, yaxis = list(title = ""), xaxis = list(title = "Year"))
}

We can now also plot the male names:

# Jewish male names, with the default minimum of 500 babies per name.
cluster_names(sector = "Jewish", sex = "M")

Or other sectors:

# Muslim names use the default minimum of 500 babies per name; the other
# sectors use a minimum of 50. The calls stay at top level (not in a loop) so
# that every heatmap is printed.
cluster_names(sector = "Muslim", sex = "M")
cluster_names(sector = "Muslim", sex = "F")
cluster_names(sector = "Christian", sex = "M", min_total = 50)
cluster_names(sector = "Christian", sex = "F", min_total = 50)
cluster_names(sector = "Druze", sex = "M", min_total = 50)
cluster_names(sector = "Druze", sex = "F", min_total = 50)
cluster_names(sector = "Other", sex = "M", min_total = 50)
cluster_names(sector = "Other", sex = "F", min_total = 50)

Unisex names

We can plot names that are used for both male and female in a given year, e.g. 2021:

# Scatter plot of the Jewish names given to both sexes in 2021: number of boys
# on the x axis against number of girls on the y axis, both on log10 scales.
# geom_abline() with its defaults draws the y = x reference line.
ggplot(
  data = babynamesIL %>%
    filter(sector == "Jewish", year == 2021) %>%
    pivot_wider(names_from = "sex", values_from = c("n", "prop"), values_fill = 0) %>%
    filter(n_M > 0, n_F > 0),
  mapping = aes(x = n_M, y = n_F, label = name)
) +
  scale_x_log10() +
  scale_y_log10() +
  geom_point() +
  ggrepel::geom_text_repel() +
  geom_abline()

Or we can use the matrices we created before to find patterns in the ratio between male and female over time:

#' Cluster unisex names by their male/female ratio over time
#'
#' For the names of a sector that appear for both sexes, computes the log2
#' ratio between the male and the female `prop` in every year, clusters the
#' names hierarchically by these ratios and draws an interactive plotly
#' heatmap with the names in clustering order.
#'
#' @param sector Value of the `sector` column to keep.
#' @param colors Vector of colors passed to `plotly::plot_ly()`. Low values
#'   (more female) map to the first color and high values (more male) to the
#'   last one.
#' @param epsilon Small constant added to both proportions before taking
#'   `log2`, so that a proportion of 0 does not produce an infinite ratio.
#' @return The plotly heatmap object.
cluster_unisex_names <- function(sector, colors = colorRampPalette(c("blue", "white", "red"))(1000), epsilon = 1e-6) {
  # Name-by-year matrix of `prop` for one sex, 0 where a name has no row for
  # a year.
  prop_matrix <- function(sex_code) {
    babynamesIL %>%
      filter(sector == !!sector, sex == !!sex_code) %>%
      tidyr::complete(sector, year, sex, name, fill = list(n = 0, prop = 0)) %>%
      select(year, name, prop) %>%
      spread(year, prop, fill = 0) %>%
      column_to_rownames("name") %>%
      as.matrix()
  }
  mat_M <- prop_matrix("M")
  mat_F <- prop_matrix("F")

  # Unisex names are the names that have a row in both matrices.
  uni_names <- intersect(rownames(mat_M), rownames(mat_F))
  if (length(uni_names) < 2) {
    # With a single name the subsetting below drops to a vector and there is
    # nothing to cluster; fail with a clear message instead.
    stop(
      "Need at least two names used for both sexes in sector '", sector,
      "' to cluster.",
      call. = FALSE
    )
  }

  # Positive values: the name is relatively more common among males.
  ratio_mat <- log2(mat_M[uni_names, ] + epsilon) - log2(mat_F[uni_names, ] + epsilon)

  # Hover labels with the counts and proportions of both sexes per name/year.
  text_mat <- babynamesIL %>%
    filter(sector == !!sector) %>%
    tidyr::complete(sector, year, sex, name, fill = list(n = 0, prop = 0)) %>%
    pivot_wider(names_from = "sex", values_from = c("n", "prop"), values_fill = 0) %>%
    mutate(
      text =
        paste(name,
          paste0("year: ", year),
          paste0("# of male: ", n_M),
          paste0("# of female: ", n_F),
          paste0("% of male: ", scales::percent(prop_M)),
          paste0("% of female: ", scales::percent(prop_F)),
          sep = "\n"
        )
    ) %>%
    select(year, name, text) %>%
    spread(year, text) %>%
    column_to_rownames("name") %>%
    as.matrix()
  text_mat <- text_mat[rownames(ratio_mat), colnames(ratio_mat)]

  # NOTE: `colors` is used as passed by the caller. It was previously
  # overwritten here with the default palette, which made the argument a no-op.
  hc <- tgs_cor(t(ratio_mat)) %>%
    tgs_dist() %>%
    hclust(method = "ward.D2")
  # Reorder the leaves by the column index of each name's maximal ratio.
  hc <- as.hclust(reorder(
    as.dendrogram(hc),
    apply(ratio_mat, 1, which.max),
    agglo.FUN = mean
  ))
  n_names <- length(uni_names)
  plotly::plot_ly(
    z = ratio_mat[hc$order, ],
    y = rownames(ratio_mat)[hc$order],
    x = colnames(ratio_mat),
    type = "heatmap",
    colors = colors,
    hoverinfo = "text",
    text = text_mat[hc$order, ]
  ) %>%
    plotly::layout(
      title = paste0(n_names, " unisex names from the ", sector, " sector"),
      yaxis = list(title = ""),
      xaxis = list(title = "Year")
    )
}

Run the function. Red marks names that are relatively more common among males, and blue marks names that are relatively more common among females:

# One heatmap per sector, using the default palette and epsilon.
cluster_unisex_names(sector = "Jewish")
cluster_unisex_names(sector = "Muslim")
cluster_unisex_names(sector = "Christian")
cluster_unisex_names(sector = "Druze")